Pre-visualization of the normalized count data
Table of content
Generated from a Jupyter Notebook - Sources
source("load_libraries.R")
source("functions.R")
load("../results/dge/norm_counts.RData")
load("../results/dge/z_scores.RData")
load("../results/dge/metadata.RData")
metadata
trait = list()
i = 'Groups'
trait[[i]] = metadata %>%
mutate("SPF" = as.integer(Microbiota == "SPF")) %>%
mutate("GF" = as.integer(Microbiota == "GF")) %>%
mutate("Young" = as.integer(Age == "Young")) %>%
mutate("Middle-aged" = as.integer(Age == "Middle-aged")) %>%
mutate("Old" = as.integer(Age == "Old")) %>%
mutate("Male" = as.integer(Sex == "Male")) %>%
mutate("Female" = as.integer(Sex == "Female")) %>%
select(-c(Microbiota, Age, Sex, project))
i = 'Microbiota / Age'
trait[[i]] = metadata %>%
mutate("SPF / Young" = as.integer(Microbiota == "SPF" & Age == "Young")) %>%
mutate("SPF / Middle-aged" = as.integer(Microbiota == "SPF" & Age == "Middle-aged")) %>%
mutate("SPF / Old" = as.integer(Microbiota == "SPF" & Age == "Old")) %>%
mutate("GF / Young" = as.integer(Microbiota == "GF" & Age == "Young")) %>%
mutate("GF / Middle-aged" = as.integer(Microbiota == "GF" & Age == "Middle-aged")) %>%
mutate("GF / Old" = as.integer(Microbiota == "GF" & Age == "Old")) %>%
select(-c(Microbiota, Age, Sex, project))
i = 'Microbiota / Age / Sex'
trait[[i]] = metadata %>%
mutate("SPF / Young / Male" = as.integer(Microbiota == "SPF" & Age == "Young" & Sex == "Male")) %>%
mutate("SPF / Young / Female" = as.integer(Microbiota == "SPF" & Age == "Young" & Sex == "Female")) %>%
mutate("SPF / Middle-aged / Male" = as.integer(Microbiota == "SPF" & Age == "Middle-aged" & Sex == "Male")) %>%
mutate("SPF / Middle-aged / Female" = as.integer(Microbiota == "SPF" & Age == "Middle-aged" & Sex == "Female")) %>%
mutate("SPF / Old / Male" = as.integer(Microbiota == "SPF" & Age == "Old" & Sex == "Male")) %>%
mutate("SPF / Old / Female" = as.integer(Microbiota == "SPF" & Age == "Old" & Sex == "Female")) %>%
mutate("GF / Young / Male" = as.integer(Microbiota == "GF" & Age == "Young" & Sex == "Male")) %>%
mutate("GF / Young / Female" = as.integer(Microbiota == "GF" & Age == "Young" & Sex == "Female")) %>%
mutate("GF / Middle-aged / Male" = as.integer(Microbiota == "GF" & Age == "Middle-aged" & Sex == "Male")) %>%
mutate("GF / Middle-aged / Female" = as.integer(Microbiota == "GF" & Age == "Middle-aged" & Sex == "Female")) %>%
mutate("GF / Old / Male" = as.integer(Microbiota == "GF" & Age == "Old" & Sex == "Male")) %>%
mutate("GF / Old / Female" = as.integer(Microbiota == "GF" & Age == "Old" & Sex == "Female")) %>%
select(-c(Microbiota, Age, Sex, project))
i = 'Age / Microbiota'
trait[[i]] = metadata %>%
mutate("Young / SPF" = as.integer(Age == "Young" & Microbiota == "SPF")) %>%
mutate("Young / GF" = as.integer(Age == "Young" & Microbiota == "GF")) %>%
mutate("Middle-aged / SPF" = as.integer(Age == "Middle-aged" & Microbiota == "SPF")) %>%
mutate("Middle-aged / GF" = as.integer(Age == "Middle-aged" & Microbiota == "GF")) %>%
mutate("Old / SPF" = as.integer(Age == "Old" & Microbiota == "SPF")) %>%
mutate("Old / GF" = as.integer(Age == "Old" & Microbiota == "GF")) %>%
select(-c(Microbiota, Age, Sex, project))
i = 'Age / Microbiota / Sex'
trait[[i]] = metadata %>%
mutate("Young / SPF / Male" = as.integer(Age == "Young" & Microbiota == "SPF" & Sex == "Male")) %>%
mutate("Young / SPF / Female" = as.integer(Age == "Young" & Microbiota == "SPF" & Sex == "Female")) %>%
mutate("Young / GF / Male" = as.integer(Age == "Young" & Microbiota == "GF" & Sex == "Male")) %>%
mutate("Young / GF / Female" = as.integer(Age == "Young" & Microbiota == "GF" & Sex == "Female")) %>%
mutate("Middle-aged / SPF / Male" = as.integer(Age == "Middle-aged" & Microbiota == "SPF" & Sex == "Male")) %>%
mutate("Middle-aged / SPF / Female" = as.integer(Age == "Middle-aged" & Microbiota == "SPF" & Sex == "Female")) %>%
mutate("Middle-aged / GF / Male" = as.integer(Age == "Middle-aged" & Microbiota == "GF" & Sex == "Male")) %>%
mutate("Middle-aged / GF / Female" = as.integer(Age == "Middle-aged" & Microbiota == "GF" & Sex == "Female")) %>%
mutate("Old / SPF / Male" = as.integer(Age == "Old" & Microbiota == "SPF" & Sex == "Male")) %>%
mutate("Old / SPF / Female" = as.integer(Age == "Old" & Microbiota == "SPF" & Sex == "Female")) %>%
mutate("Old / GF / Male" = as.integer(Age == "Old" & Microbiota == "GF" & Sex == "Male")) %>%
mutate("Old / GF / Female" = as.integer(Age == "Old" & Microbiota == "GF" & Sex == "Female")) %>%
select(-c(Microbiota, Age, Sex, project))
i = 'Sex / Age'
trait[[i]] = metadata %>%
mutate("Male / Young" = as.integer(Sex == "Male" & Age == "Young")) %>%
mutate("Male / Middle-aged" = as.integer(Sex == "Male" & Age == "Middle-aged")) %>%
mutate("Male / Old" = as.integer(Sex == "Male" & Age == "Old")) %>%
mutate("Female / Young" = as.integer(Sex == "Female" & Age == "Young")) %>%
mutate("Female / Middle-aged" = as.integer(Sex == "Female" & Age == "Middle-aged")) %>%
mutate("Female / Old" = as.integer(Sex == "Female" & Age == "Old")) %>%
select(-c(Microbiota, Age, Sex, project))
i = 'Sex / Microbiota'
trait[[i]] = metadata %>%
mutate("Male / SPF" = as.integer(Sex == "Male" & Microbiota == "SPF")) %>%
mutate("Male / GF" = as.integer(Sex == "Male" & Microbiota == "GF")) %>%
mutate("Female / SPF" = as.integer(Sex == "Female" & Microbiota == "SPF")) %>%
mutate("Female / GF" = as.integer(Sex == "Female" & Microbiota == "GF")) %>%
select(-c(Microbiota, Age, Sex, project))
col_order = list()
annot_col = list()
col_order$msa = c(grep("SPF_8w_M_+", colnames(norm_counts), perl=TRUE, value=TRUE),
grep("SPF_52w_M_+", colnames(norm_counts), perl=TRUE, value=TRUE),
grep("SPF_104w_M_+", colnames(norm_counts), perl=TRUE, value=TRUE),
grep("SPF_8w_F_+", colnames(norm_counts), perl=TRUE, value=TRUE),
grep("SPF_52w_F_+", colnames(norm_counts), perl=TRUE, value=TRUE),
grep("SPF_104w_F_+", colnames(norm_counts), perl=TRUE, value=TRUE),
grep("GF_8w_M_+", colnames(norm_counts), perl=TRUE, value=TRUE),
grep("GF_52w_M_+", colnames(norm_counts), perl=TRUE, value=TRUE),
grep("GF_104w_M_+", colnames(norm_counts), perl=TRUE, value=TRUE),
grep("GF_8w_F_+", colnames(norm_counts), perl=TRUE, value=TRUE),
grep("GF_52w_F_+", colnames(norm_counts), perl=TRUE, value=TRUE),
grep("GF_104w_F_+", colnames(norm_counts), perl=TRUE, value=TRUE))
annot_col$msa = as.data.frame(metadata %>%
select(c("sample","Age", "Sex", "Microbiota")) %>%
mutate(Age = factor(Age, c("Young", "Middle-aged", "Old"))) %>%
column_to_rownames('sample'))
col_order$mas = c(grep("SPF_8w_F_+", colnames(norm_counts), perl=TRUE, value=TRUE),
grep("SPF_8w_M_+", colnames(norm_counts), perl=TRUE, value=TRUE),
grep("SPF_52w_F_+", colnames(norm_counts), perl=TRUE, value=TRUE),
grep("SPF_52w_M_+", colnames(norm_counts), perl=TRUE, value=TRUE),
grep("SPF_104w_F_+", colnames(norm_counts), perl=TRUE, value=TRUE),
grep("SPF_104w_M_+", colnames(norm_counts), perl=TRUE, value=TRUE),
grep("GF_8w_F_+", colnames(norm_counts), perl=TRUE, value=TRUE),
grep("GF_8w_M_+", colnames(norm_counts), perl=TRUE, value=TRUE),
grep("GF_52w_F_+", colnames(norm_counts), perl=TRUE, value=TRUE),
grep("GF_52w_M_+", colnames(norm_counts), perl=TRUE, value=TRUE),
grep("GF_104w_F_+", colnames(norm_counts), perl=TRUE, value=TRUE),
grep("GF_104w_M_+", colnames(norm_counts), perl=TRUE, value=TRUE))
annot_col$mas = as.data.frame(metadata %>%
select(c("sample", "Sex", "Age", "Microbiota")) %>%
mutate(Age = factor(Age, c("Young", "Middle-aged", "Old"))) %>%
column_to_rownames('sample'))
col_order$sma = c(grep("SPF_8w_M_+", colnames(norm_counts), perl=TRUE, value=TRUE),
grep("SPF_52w_M_+", colnames(norm_counts), perl=TRUE, value=TRUE),
grep("SPF_104w_M_+", colnames(norm_counts), perl=TRUE, value=TRUE),
grep("GF_8w_M_+", colnames(norm_counts), perl=TRUE, value=TRUE),
grep("GF_52w_M_+", colnames(norm_counts), perl=TRUE, value=TRUE),
grep("GF_104w_M_+", colnames(norm_counts), perl=TRUE, value=TRUE),
grep("SPF_8w_F_+", colnames(norm_counts), perl=TRUE, value=TRUE),
grep("SPF_52w_F_+", colnames(norm_counts), perl=TRUE, value=TRUE),
grep("SPF_104w_F_+", colnames(norm_counts), perl=TRUE, value=TRUE),
grep("GF_8w_F_+", colnames(norm_counts), perl=TRUE, value=TRUE),
grep("GF_52w_F_+", colnames(norm_counts), perl=TRUE, value=TRUE),
grep("GF_104w_F_+", colnames(norm_counts), perl=TRUE, value=TRUE))
annot_col$sma = as.data.frame(metadata %>%
select(c("sample", "Age","Microbiota", "Sex")) %>%
mutate(Age = factor(Age, c("Young", "Middle-aged", "Old"))) %>%
column_to_rownames('sample'))
col_order$sam = c(grep("SPF_8w_M_+", colnames(norm_counts), perl=TRUE, value=TRUE),
grep("GF_8w_M_+", colnames(norm_counts), perl=TRUE, value=TRUE),
grep("SPF_52w_M_+", colnames(norm_counts), perl=TRUE, value=TRUE),
grep("GF_52w_M_+", colnames(norm_counts), perl=TRUE, value=TRUE),
grep("SPF_104w_M_+", colnames(norm_counts), perl=TRUE, value=TRUE),
grep("GF_104w_M_+", colnames(norm_counts), perl=TRUE, value=TRUE),
grep("SPF_8w_F_+", colnames(norm_counts), perl=TRUE, value=TRUE),
grep("GF_8w_F_+", colnames(norm_counts), perl=TRUE, value=TRUE),
grep("SPF_52w_F_+", colnames(norm_counts), perl=TRUE, value=TRUE),
grep("GF_52w_F_+", colnames(norm_counts), perl=TRUE, value=TRUE),
grep("SPF_104w_F_+", colnames(norm_counts), perl=TRUE, value=TRUE),
grep("GF_104w_F_+", colnames(norm_counts), perl=TRUE, value=TRUE))
annot_col$sam = as.data.frame(metadata %>%
select(c("sample", "Microbiota", "Age", "Sex")) %>%
mutate(Age = factor(Age, c("Young", "Middle-aged", "Old"))) %>%
column_to_rownames('sample'))
col_order$ams = c(grep("SPF_8w_M_+", colnames(norm_counts), perl=TRUE, value=TRUE),
grep("SPF_8w_F_+", colnames(norm_counts), perl=TRUE, value=TRUE),
grep("GF_8w_M_+", colnames(norm_counts), perl=TRUE, value=TRUE),
grep("GF_8w_F_+", colnames(norm_counts), perl=TRUE, value=TRUE),
grep("SPF_52w_M_+", colnames(norm_counts), perl=TRUE, value=TRUE),
grep("SPF_52w_F_+", colnames(norm_counts), perl=TRUE, value=TRUE),
grep("GF_52w_M_+", colnames(norm_counts), perl=TRUE, value=TRUE),
grep("GF_52w_F_+", colnames(norm_counts), perl=TRUE, value=TRUE),
grep("SPF_104w_M_+", colnames(norm_counts), perl=TRUE, value=TRUE),
grep("SPF_104w_F_+", colnames(norm_counts), perl=TRUE, value=TRUE),
grep("GF_104w_M_+", colnames(norm_counts), perl=TRUE, value=TRUE),
grep("GF_104w_F_+", colnames(norm_counts), perl=TRUE, value=TRUE))
annot_col$ams = as.data.frame(metadata %>%
select(c("sample", "Sex", "Microbiota", "Age")) %>%
mutate(Age = factor(Age, c("Young", "Middle-aged", "Old"))) %>%
column_to_rownames('sample'))
col_order$asm = c(grep("SPF_8w_M_+", colnames(norm_counts), perl=TRUE, value=TRUE),
grep("GF_8w_M_+", colnames(norm_counts), perl=TRUE, value=TRUE),
grep("SPF_8w_F_+", colnames(norm_counts), perl=TRUE, value=TRUE),
grep("GF_8w_F_+", colnames(norm_counts), perl=TRUE, value=TRUE),
grep("SPF_52w_M_+", colnames(norm_counts), perl=TRUE, value=TRUE),
grep("GF_52w_M_+", colnames(norm_counts), perl=TRUE, value=TRUE),
grep("SPF_52w_F_+", colnames(norm_counts), perl=TRUE, value=TRUE),
grep("GF_52w_F_+", colnames(norm_counts), perl=TRUE, value=TRUE),
grep("SPF_104w_M_+", colnames(norm_counts), perl=TRUE, value=TRUE),
grep("GF_104w_M_+", colnames(norm_counts), perl=TRUE, value=TRUE),
grep("SPF_104w_F_+", colnames(norm_counts), perl=TRUE, value=TRUE),
grep("GF_104w_F_+", colnames(norm_counts), perl=TRUE, value=TRUE))
annot_col$asm = as.data.frame(metadata %>%
select(c("sample", "Microbiota", "Sex", "Age")) %>%
mutate(Age = factor(Age, c("Young", "Middle-aged", "Old"))) %>%
column_to_rownames('sample'))
annot_colors = list()
annot_colors$Microbiota = c("#65ce41","#c3a7f9")
names(annot_colors$Microbiota) = c("SPF", "GF")
annot_colors$Age = c("#ff82ec", "#00dcb2", "#e2b33a")
names(annot_colors$Age) = c("Young", "Middle-aged", "Old")
annot_colors$Sex = c("#ff918b", "#00d2fb")
names(annot_colors$Sex) = c("Female", "Male")
save(annot_colors, file="../results/dge/annot_colors.RData")
List of genes to check: Egr1, Jun, Zfp36l1, Malat1, Dusp1, Nr4a1, Fos
facs_genes = c('Egr1', 'Jun', 'Zfp36l1', 'Malat1', 'Dusp1', 'Nr4a1', 'Fos')
Normalized counts
plot_heatmap(norm_counts,
facs_genes,
col_order$msa,
annot_col$msa,
show_rownames=TRUE)
Z-scores
pheatmap(z_scores[facs_genes, col_order$msa],
cluster_rows=F,
cluster_cols=F,
show_rownames=T,
show_colnames=F,
annotation_col=as.data.frame(annot_col$msa),
annotation_row=NULL,
annotation_colors=NULL,
color=rev(brewer.pal(11, "RdBu")),
breaks=seq(-3.5, 3.5, length=12))
core_signature_genes = list()
core_signature_genes$microglia = c('Tmem119', 'Hexb', 'Scl2a5', 'P2ry12', 'Siglech', 'Trem2', 'P2ry13')
core_signature_genes$macrophage = c('Mrc1', 'Cd163', 'Lyve1', 'Siglec1', 'Pf4')
core_signature_genes$monocyte = c('Ly6c2', 'Ccr2', 'Anxa8', 'Nr4a1', 'Plac8')
core_signature_genes$dc = c('Flt3', 'Zbtb46', 'Itgae', 'Batf3', 'Clec9a')
core_signature_genes$granulocyte = c('Mpo', 'Ngp', 'Wfdc17', 'Ly6g')
core_signature_genes$t = c('Trbc1', 'Trbc2', 'Cd3e', 'Cd3d', 'Nkg7')
core_signature_genes$b= c('Cd79a', 'Igkc', 'Ighm', 'Cd19', 'Ebf1')
core_signature_genes$mast = c('Tpsb2', 'Mcpt4', 'Cma1', 'Cpa3')
core_signature_genes$nk = c('NKp46', 'NK1.1', 'NKG2D', 'Gzmb', 'Eomes', 'Tbet')
core_signature_gene_df = melt(core_signature_genes) %>%
rename(genes = value) %>%
rename(cell = L1)
removed_genes = core_signature_gene_df %>%
filter(!(genes %in% rownames(z_scores)))
core_signature_gene_df = core_signature_gene_df %>%
filter(genes %in% rownames(z_scores))
core_signature_gene_df
Genes in list, but not in the counts
removed_genes
Normalized counts
cell_annot_colors = annot_colors
cell_annot_colors$Cell = c("#56a4d0", "#d5682b", "#7b6fcb", "#a49f3d", "#c356b8", "#59a85f", "#d04659", "#b7794a", "#c0668e")
names(cell_annot_colors$Cell) = c("microglia", "macrophage", "monocyte", "dc", "granulocyte", "t", "b", "mast", "nk")
data = norm_counts[as.vector(core_signature_gene_df$genes), col_order$msa]
breaks = quantile_breaks(data, n = 11)
pheatmap(data,
cluster_rows=F,
cluster_cols=F,
show_rownames=T,
show_colnames=F,
annotation_colors=cell_annot_colors,
annotation_col=annot_col$msa,
annotation_row=data.frame(Cell=core_signature_gene_df$cell, row.names=core_signature_gene_df$genes),
breaks=breaks,
color=inferno(10))
pdf('../results/dge/cell_type_gene_norm_counts.pdf')
pheatmap(data,
cluster_rows=F,
cluster_cols=F,
show_rownames=T,
show_colnames=F,
annotation_colors=cell_annot_colors,
annotation_col=annot_col$msa,
annotation_row=data.frame(Cell=core_signature_gene_df$cell, row.names=core_signature_gene_df$genes),
breaks=breaks,
color=inferno(10))
dev.off()
Z-score
pheatmap(z_scores[as.vector(core_signature_gene_df$genes), col_order$msa],
cluster_rows=F,
cluster_cols=F,
show_rownames=T,
show_colnames=F,
annotation_col=annot_col$msa,
annotation_row=data.frame(cell=core_signature_gene_df$cell, row.names=core_signature_gene_df$genes),
annotation_colors = cell_annot_colors,
color=rev(brewer.pal(11, "RdBu")),
breaks = seq(-3.5, 3.5, length=12))
Genes that could be artefact due to FACS sorting or contamination from other cell types and samples that seem contaminated: 'GF_8w_M_2_2', 'SPF_52w_F_1_2', 'SPF_104w_M_3_2'
potentially_contaminated_samples = c('GF_8w_M_2_2', 'SPF_52w_F_1_2', 'SPF_104w_M_3_2')
Filter the normalized counts (value before and )
original_norm_counts = norm_counts
norm_counts = data.frame(norm_counts) %>%
rownames_to_column('genes') %>%
filter(!genes %in% facs_genes) %>%
filter(!genes %in% core_signature_genes$macrophage) %>%
filter(!genes %in% core_signature_genes$monocyte) %>%
filter(!genes %in% core_signature_genes$dc) %>%
filter(!genes %in% core_signature_genes$granulocyte) %>%
filter(!genes %in% core_signature_genes$t) %>%
filter(!genes %in% core_signature_genes$b) %>%
filter(!genes %in% core_signature_genes$mast) %>%
filter(!genes %in% core_signature_genes$nk) %>%
filter(!genes %in% core_signature_genes$nk) %>%
select(-c(GF_8w_M_2_2, SPF_52w_F_1_2, SPF_104w_M_3_2)) %>%
filter_at(vars(-genes), any_vars(. > 0)) %>%
column_to_rownames('genes')
dim(original_norm_counts)[1]-dim(norm_counts)[1]
save(norm_counts, file="../results/dge/filtered_norm_counts.RData")
write.table(norm_counts, "../results/dge/filtered_norm_counts", sep = "\t", quote = FALSE)
mean_counts = apply(norm_counts, 1, mean)
sd_counts = apply(norm_counts, 1, sd)
z_scores = (norm_counts - mean_counts)/sd_counts
save(z_scores, file="../results/dge/filtered_z_scores.RData")
write.table(z_scores, "../results/dge/filtered_z_scores", sep = "\t", quote = FALSE)
Filter the samples from annotations and metadata
for(i in 1:length(trait)){
trait[[i]] = trait[[i]] %>%
filter(!sample %in% potentially_contaminated_samples)
}
save(trait, file="../results/dge/trait.RData")
col_order = lapply(col_order, function(x) return(x[!x %in% potentially_contaminated_samples]))
annot_col = lapply(annot_col, function(x) return(x[!rownames(x) %in% potentially_contaminated_samples,]))
save(col_order, file="../results/dge/col_order.RData")
save(annot_col, file="../results/dge/annot_col.RData")
metadata = metadata %>%
filter(!sample %in% potentially_contaminated_samples)
save(metadata, file="../results/dge/filtered_metadata.RData")
all_deg_genes = c()
save(all_deg_genes, file="../results/dge/all_deg_genes.RData")
sample_dist = t(norm_counts) %>% dist
sample_dist_matrix = as.matrix(sample_dist)
list_colors = unlist(annot_colors)
names(list_colors) = gsub("(Microbiota|Sex|Age)\\.","",names(list_colors))
colors = metadata %>%
mutate(Microbiota = gsub("GF", list_colors['GF'], Microbiota)) %>%
mutate(Microbiota = gsub("SPF", list_colors['SPF'], Microbiota)) %>%
mutate(Age = gsub("Old", list_colors['Old'], Age)) %>%
mutate(Age = gsub("Middle-aged", list_colors['Middle-aged'], Age)) %>%
mutate(Age = gsub("Young", list_colors['Young'], Age)) %>%
mutate(Sex = gsub("Female", list_colors['Female'], Sex)) %>%
mutate(Sex = gsub("Male", list_colors['Male'], Sex)) %>%
select(c("Microbiota", "Age", "Sex", "sample")) %>%
column_to_rownames('sample')
plot_clustering_tree = function(sampleTree) {
par(mar = c(4,4,1,1))
par(cex = 0.6)
plot(sampleTree, leaflab="none")
colored_bars(colors, sampleTree, rowLabels=colnames(colors), y_shift=0.5)
legend("topright", fill=unlist(list_colors), names(list_colors), cex=0.7)
}
sample_clust = sample_dist %>% hclust(method = "ward.D2")
sample_clust_tree = sample_clust %>% as.dendrogram
plot_clustering_tree(sample_clust_tree)
pdf('../results/dge/dendogram.pdf')
plot_clustering_tree(sample_clust_tree)
dev.off()
Extract genes from chrX / chrX_GL456233_random / chrY
annot = read.table("../data/mm10_UCSC_07_15_genes.gtf", sep="\t")
chrX_chrY_genes = annot %>%
filter(V1 == 'chrX' | V1 == 'chrX_GL456233_random' | V1 == 'chrY') %>%
select(V9) %>%
tidyr::separate(V9, into = c("v1", "gene_id", "v2", "gene_name", "v3", "p_id", "v4", "transcript_id", "v5", "tss_id"), sep = " |;") %>%
select(gene_id) %>%
distinct() %>%
pull(gene_id)
length(chrX_chrY_genes)
Remove genes (number displayed) from the count table
chrX_chrY_counts = norm_counts %>%
rownames_to_column('genes') %>%
filter(genes %in% chrX_chrY_genes)
dim(chrX_chrY_counts)[1]
non_chrX_chrY_counts = norm_counts %>%
rownames_to_column('genes') %>%
filter(!genes %in% chrX_chrY_genes) %>%
column_to_rownames('genes')
Clustering method: Ward D2
sampleTree = t(non_chrX_chrY_counts) %>% dist %>% hclust(method = "ward.D2") %>% as.dendrogram
plot_clustering_tree(sampleTree)
Elbow method to compute the number of cluster
fviz_nbclust(t(norm_counts), diss=sample_dist, FUN=hcut, method="wss")
Average silhouette method to compute the number of cluster
fviz_nbclust(t(norm_counts), diss=sample_dist, FUN=hcut, method="silhouette")
kmeans with 6 clusters
sample_clust_km = kmeans(sample_dist, 6)
Within cluster sum of square (how similar each individual, within the cluster are. The smaller the number, the more similar inidivduals are)
sample_clust_km$withinss
fviz_cluster(sample_clust_km, t(norm_counts))
clust_metadata = metadata %>%
select(-project) %>%
add_column(Cluster = as.factor(sample_clust_km$cluster)) %>%
column_to_rownames('sample')
clust_size = count(clust_metadata, Cluster) %>%
rename( Total = n)
count(clust_metadata, Cluster, Microbiota) %>%
spread(Microbiota, n, fill = 0) %>%
full_join(clust_size, by='Cluster')
count(clust_metadata, Cluster, Sex) %>%
spread(Sex, n, fill = 0) %>%
full_join(clust_size, by='Cluster')
count(clust_metadata, Cluster, Age) %>%
spread(Age, n, fill = 0) %>%
full_join(clust_size, by='Cluster')
count(clust_metadata, Cluster, Microbiota, Sex) %>%
mutate(MS = paste(Microbiota, Sex, sep=' - ')) %>%
select(-c(Microbiota, Sex)) %>%
spread(MS, n, fill = 0) %>%
full_join(clust_size, by='Cluster')
count(clust_metadata, Cluster, Sex, Age) %>%
mutate(SA = paste(Sex, Age, sep=' - ')) %>%
select(-c(Sex, Age)) %>%
spread(SA, n, fill = 0) %>%
full_join(clust_size, by='Cluster')
count(clust_metadata, Cluster, Age, Microbiota) %>%
mutate(AM = paste(Age, Microbiota, sep=' - ')) %>%
select(-c(Age, Microbiota)) %>%
spread(AM, n, fill = 0) %>%
full_join(clust_size, by='Cluster')
count(clust_metadata, Cluster, Microbiota, Sex, Age) %>%
mutate(s = paste(Sex, Age, Microbiota, sep=' - ')) %>%
select(-c(Microbiota, Sex, Age)) %>%
spread(s, n, fill = 0) %>%
full_join(clust_size, by='Cluster')
Ward D2 clustering for the rows and cols
pheatmap(sample_dist_matrix,
show_rownames=F,
show_colnames=F,
cluster_cols=sample_clust,
cluster_rows=sample_clust,
annotation_col=annot_col$sam,
annotation_row=annot_col$sam,
annotation_colors=annot_colors,
col=colorRampPalette(rev(brewer.pal(9, "Blues")))(255))
pca_data = t(norm_counts)
autoplot(prcomp(pca_data), data = clust_metadata, colour = 'Microbiota', size = 5) +
scale_color_manual(values = annot_colors$Microbiota)
autoplot(prcomp(pca_data), data = clust_metadata, colour = 'Age', size = 5) +
scale_color_manual(values = annot_colors$Age)
autoplot(prcomp(pca_data), data = clust_metadata, colour = 'Sex', size = 5) +
scale_color_manual(values = annot_colors$Sex)
autoplot(prcomp(pca_data), data = clust_metadata, colour = 'Cluster', size = 5)
autoplot(prcomp(pca_data), data = clust_metadata, colour = 'Cluster', shape='Sex', size = 5)
autoplot(prcomp(pca_data), data = clust_metadata, colour = 'Cluster', shape='Microbiota', size = 5)
autoplot(prcomp(pca_data), data = clust_metadata, colour = 'Cluster', shape='Age', size = 5)
autoplot(prcomp(pca_data), data = clust_metadata, colour = 'Age', shape = 'Sex', size = 5) +
scale_color_manual(values = annot_colors$Age)
autoplot(prcomp(pca_data), data = clust_metadata, colour = 'Age', shape = 'Microbiota', size = 5) +
scale_color_manual(values = annot_colors$Age)
autoplot(prcomp(pca_data), data = clust_metadata, colour = 'Sex', shape = 'Microbiota', size = 5) +
scale_color_manual(values = annot_colors$Sex)
plot_SPF_GF_PCA = function(g, a) {
pca_metadata = metadata %>%
filter(Sex == g & Age == a)
pca_data = t(norm_counts)[clust_metadata$sample,]
autoplot(prcomp(pca_data), data = clust_metadata, colour = 'Microbiota', size = 6) +
scale_color_manual(values = annot_colors$Microbiota)
ggsave(paste("../results/dge/spf_gf_pca/", g, "_", a ,".pdf", sep=""))
autoplot(prcomp(pca_data), data = clust_metadata, colour = 'Microbiota', size = 6) +
scale_color_manual(values = annot_colors$Microbiota)
}
Male & Young
plot_SPF_GF_PCA('Male', 'Young')
Male & Middle-aged
plot_SPF_GF_PCA('Male', 'Middle-aged')
Male & Old
plot_SPF_GF_PCA('Male', 'Old')
Female & Young
plot_SPF_GF_PCA('Female', 'Young')
Female & Middle-aged
plot_SPF_GF_PCA('Female', 'Middle-aged')
Female & Old
plot_SPF_GF_PCA('Female', 'Old')
plot_z_score_heatmap(z_scores,
rownames(z_scores),
col_order$msa,
annot_col$msa,
"All genes",
col_order$msa)
pdf('../results/dge/previsualization/z_score_msa.pdf')
plot_z_score_heatmap(z_scores,
rownames(z_scores),
col_order$msa,
annot_col$msa,
"All genes",
col_order$msa)
dev.off()
genes = c(
'Ero1lb', 'D16Ertd472e', 'Mcur1', 'Gpr137b-ps', 'Lonrf3', 'G530011O06Rik',
'1700112E06Rik', 'Ly86', 'Cd52', 'Sdc3', 'Dynlt1b', 'Tuba4a', 'Sult1a1',
'Notch4', 'Nnt', 'Plcd3', 'Rab4a', 'Wdfy1', 'Ctse', 'Gpr137b', 'Htra3')
Column order: microbiota - sex - age
pheatmap(z_scores[genes,col_order$msa],
cluster_rows=F,
cluster_cols=F,
show_rownames=T,
show_colnames=F,
annotation_col=annot_col$msa,
annotation_row=NULL,
annotation_colors = NULL,
color=rev(brewer.pal(11, "RdBu")),
breaks = seq(-3.5, 3.5, length=12),
main ='', cellheight = 12)
pdf('../results/dge/previsualization/gf_signature_genes_z_scores.pdf')
pheatmap(z_scores[genes,col_order$msa],
cluster_rows=F,
cluster_cols=F,
show_rownames=T,
show_colnames=F,
annotation_col=annot_col$msa,
annotation_row=NULL,
annotation_colors = NULL,
color=rev(brewer.pal(11, "RdBu")),
breaks = seq(-3.5, 3.5, length=12),
main ='', cellheight = 12)
dev.off()
genes = c(
'Aen', 'Ap5z1', 'Apc', 'Ascc3', 'Atm', 'Atr', 'Bod1l', 'Ccnd1', 'Cep164', 'Chd2', 'Crebbp', 'Dclre1c', 'Ddx5',
'Dot1l', 'Endov', 'Ercc5', 'Fam168a', 'Fbxo5', 'Foxo3', 'Gadd45g', 'Hdac10', 'Herc2', 'Hipk2', 'Hspa1a', 'Htra2',
'Huwe1', 'Ino80', 'Ino80d', 'Marf1', 'Mif', 'Mlh3', 'Nfrkb', 'Nhej1', 'Parp3', 'Phf1', 'Pidd1', 'Pik3r1', 'Pms2',
'Polg2', 'Poli', 'Pot1b', 'Ppp1r10', 'Prkdc', 'Psme4', 'Rad9b', 'Rbbp6', 'Recql5', 'Rev1', 'Setd2', 'Smg1', 'Stk11',
'Stxbp4', 'Supt16', 'Taok1', 'Taok2', 'Trrap', 'Ubr5', 'Uimc1', 'Uvssa', 'Xrcc3', 'Zc3h12a', 'Zfyve26')
Column order: microbiota - sex - age
pheatmap(z_scores[genes,col_order$msa],
cluster_rows=F,
cluster_cols=F,
show_rownames=T,
show_colnames=F,
annotation_col=annot_col$msa,
annotation_row=NULL,
annotation_colors = NULL,
color=rev(brewer.pal(11, "RdBu")),
breaks = seq(-3.5, 3.5, length=12),
main ='',
fontsize_row=8)
pdf('../results/dge/previsualization/dna_damage_repair_genes_z_scores.pdf')
pheatmap(z_scores[genes,col_order$msa],
cluster_rows=F,
cluster_cols=F,
show_rownames=T,
show_colnames=F,
annotation_col=annot_col$msa,
annotation_row=NULL,
annotation_colors = NULL,
color=rev(brewer.pal(11, "RdBu")),
breaks = seq(-3.5, 3.5, length=12),
main ='',
fontsize_row=8)
dev.off()
pheatmap(z_scores[genes,col_order$msa],
cluster_rows=F,
cluster_cols=F,
show_rownames=F,
show_colnames=F,
annotation_col=annot_col$msa,
annotation_row=NULL,
annotation_colors = NULL,
color=rev(brewer.pal(11, "RdBu")),
breaks = seq(-3.5, 3.5, length=12),
main ='')
pdf('../results/dge/previsualization/dna_damage_repair_genes_z_scores_wo_names.pdf')
pheatmap(z_scores[genes,col_order$msa],
cluster_rows=F,
cluster_cols=F,
show_rownames=F,
show_colnames=F,
annotation_col=annot_col$msa,
annotation_row=NULL,
annotation_colors = NULL,
color=rev(brewer.pal(11, "RdBu")),
breaks = seq(-3.5, 3.5, length=12),
main ='')
dev.off()
Weighted gene co-expression network analysis using WGCNA package
Keep only genes that have a count >= 10 in more than 90% of the samples (number removed / kept displayed)
filtered_norm_counts = data.frame(original_norm_counts) %>%
rownames_to_column('genes') %>%
mutate(rowsum = rowSums(select(., -genes) >= 10)) %>%
filter(rowsum >= 0.9*dim(original_norm_counts)[2]) %>%
select(-rowsum) %>%
filter(!genes %in% facs_genes) %>%
filter(!genes %in% core_signature_genes$macrophage) %>%
filter(!genes %in% core_signature_genes$monocyte) %>%
filter(!genes %in% core_signature_genes$dc) %>%
#filter(!genes %in% core_signature_genes$granulocyte) %>%
filter(!genes %in% core_signature_genes$t) %>%
filter(!genes %in% core_signature_genes$b) %>%
filter(!genes %in% core_signature_genes$mast) %>%
filter(!genes %in% core_signature_genes$nk) %>%
filter(!genes %in% c('Calm1')) %>%
select(-c(GF_8w_M_2_2, SPF_52w_F_1_2, SPF_104w_M_3_2)) %>%
column_to_rownames('genes')
dim(original_norm_counts)[1] - dim(filtered_norm_counts)[1]
dim(filtered_norm_counts)[1]
#filtered_norm_counts = data.frame(norm_counts) %>%
# rownames_to_column('genes') %>%
# mutate(rowsum = rowSums(select(., -genes) >= 10)) %>%
# filter(rowsum >= 0.9*dim(original_norm_counts)[2]) %>%
# select(-rowsum) %>%
# #filter(!genes %in% facs_genes) %>%
# #filter(!genes %in% core_signature_genes$macrophage) %>%
# #filter(!genes %in% core_signature_genes$monocyte) %>%
# #filter(!genes %in% core_signature_genes$dc) %>%
# #filter(!genes %in% core_signature_genes$granulocyte) %>%
# #filter(!genes %in% core_signature_genes$t) %>%
# #filter(!genes %in% core_signature_genes$b) %>%
# #filter(!genes %in% core_signature_genes$mast) %>%
# #filter(!genes %in% core_signature_genes$nk) %>%
# filter(!genes %in% c('Calm1')) %>%
# #select(-c(GF_8w_M_2_2, SPF_52w_F_1_2, SPF_104w_M_3_2)) %>%
# column_to_rownames('genes')
#dim(original_norm_counts)[1] - dim(filtered_norm_counts)[1]
#dim(filtered_norm_counts)[1]
dim(filtered_norm_counts)
Analysis of scale free topology for multiple soft thresholding powers, with signed hybrid network type
networkType = "signed hybrid"
powers = c(seq(1, 10, by = 1), seq(12, 20, by = 2))
# Choosing the soft-thresholding power
sft = pickSoftThreshold(t(filtered_norm_counts),
powerVector = powers,
dataIsExpr = T,
RsquaredCut = 0.85,
verbose = 5,
networkType = "signed hybrid")
par(mfrow = c(1,2))
cex1 = 0.9
# Scale-free topology fit index as a function of the soft-thresholding power
plot(sft$fitIndices[,1],
-sign(sft$fitIndices[,3])*sft$fitIndices[,2],
xlab="Soft Threshold (power)",
ylab="Scale Free Topology Model Fit,signed R^2",
type="n",
main = paste("Scale independence"))
text(sft$fitIndices[,1],
-sign(sft$fitIndices[,3])*sft$fitIndices[,2],
labels=powers,
cex=cex1,
col="red");
# this line corresponds to using an R^2 cut-off of h
abline(h=0.90,col="red")
# Mean connectivity as a function of the soft-thresholding power
plot(sft$fitIndices[,1],
sft$fitIndices[,5],
xlab="Soft Threshold (power)",
ylab="Mean Connectivity",
type="n",
main = paste("Mean connectivity"))
text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red")
sft$fitIndices
sft$fitIndices[(-sign(sft$fitIndices[,3])*sft$fitIndices[,2])>=0.8,c(1,5)]
Block-wise network construction and module detection
soft_thresholding_power = 6
min_module_size = 65
mergeCutHeight = 0.35
# Block-wise network construction and module detection
bwnet = blockwiseModules(t(filtered_norm_counts),
checkMissingData = TRUE,
maxBlockSize = 11000,
power = soft_thresholding_power,
networkType = networkType,
TOMType = "signed",
numericLabels = TRUE,
saveTOMs = TRUE,
saveTOMFileBase = "norm_genes_TOM",
mergeCutHeight = mergeCutHeight,
verbose=3,
corType="bicor",
maxPOutliers=0,
minModuleSize = min_module_size,
reassignThreshold = 0,
deepSplit = 2)
Size of the modules (ME0: genes not assigned to a module) and number of genes in modules
# Modules
mod_sizes = table(bwnet$colors)
names(mod_sizes) = paste("ME", names(mod_sizes), sep="")
mod_sizes
module_nb = dim(table(bwnet$colors))-1
# Get genes that are in one module
mod_genes = rownames(filtered_norm_counts)[bwnet$colors>0]
Dendrogram and the module colors underneath the block
bwModuleColors = labels2colors(bwnet$colors)
plotDendroAndColors(bwnet$dendrograms[[1]],
bwModuleColors[bwnet$blockGenes[[1]]],
"Module colors",
main = "Gene dendrogram and module colors in block 1",
dendroLabels = FALSE,
hang = 0.03,
addGuide = TRUE,
guideHang = 0.05)
pdf("../results/dge/modules/gene_dendogram_module_colors.pdf")
plotDendroAndColors(bwnet$dendrograms[[1]],
bwModuleColors[bwnet$blockGenes[[1]]],
"Module colors",
main = "Gene dendrogram and module colors in block 1",
dendroLabels = FALSE,
hang = 0.03,
addGuide = TRUE,
guideHang = 0.05)
dev.off()
Associate module color to genes
gene_colors = bwnet$colors
names(gene_colors) = rownames(filtered_norm_counts)
head(gene_colors)
Recalculate MEs with color labels
MEs = moduleEigengenes(t(filtered_norm_counts), gene_colors)$eigengenes
Extract the palette for next plots
pal2 = labels2colors(sort(unique(bwnet$colors)))
names(pal2) = names(mod_sizes)
pal2
save(pal2, file = "../results/dge/dge_net_pal2.RData")
Module-trait correlation analysis between the module eigengene (ME) and the different trait (combination of Microbiota, age and sex)
head(MEs)
head(trait[['Age / Microbiota / Sex']])
c = cor(MEs, trait[['Age / Microbiota / Sex']] %>% select(-c(sample)), use = "p")
head(c)
plot_module_groups = function(trait, vertsep){
trait = trait %>% select(-c(sample))
# Calculate correlation
moduleTraitCor = cor(MEs, trait, use = "p")
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, dim(filtered_norm_counts)[2])
# Will display correlations and their p-values
textMatrix = paste(signif(moduleTraitCor, 2), "\n(",signif(moduleTraitPvalue, 1), ")", sep = "")
dim(textMatrix) = dim(moduleTraitCor)
# Plot correlation
par(mar = c(6, 8.5, 3, 3))
labeledHeatmap(Matrix = moduleTraitCor,
xLabels = colnames(trait),
ySymbols = paste(names(mod_sizes), " \n(", mod_sizes, " genes)", sep = ""),
yLabels = paste('ME', pal2, sep=''),
colorLabels = FALSE,
yColorLabels = TRUE,
colors = rev(brewer.pal(11, "RdBu")),
textMatrix = textMatrix,
setStdMargins = FALSE,
cex.text = 0.5,
cex.lab.y = .75,
zlim = c(-1,1),
verticalSeparator.x = vertsep)
}
plot_module_groups(trait[['Groups']], c(2,5))
plot_module_groups(trait[['Microbiota / Age']], 3)
plot_module_groups(trait[['Age / Microbiota / Sex']], c(4, 8))
pdf("../results/dge/modules/module_group_age_microbiota_sex.pdf")
plot_module_groups(trait[['Age / Microbiota / Sex']], c(4, 8))
dev.off()
apply(trait[['Age / Microbiota / Sex']] %>% select(-c(sample)),2, sum)
plot_module_groups(trait[['Sex / Age']], c(3))
plot_module_groups(trait[['Sex / Microbiota']], c(2))
Associate genes to modules
genes_in_modules_bool = sapply(0:module_nb, function(x) return(gene_colors == x))
colnames(genes_in_modules_bool) = names(mod_sizes)
genes_in_modules = lapply(names(mod_sizes), function(x) return(rownames(genes_in_modules_bool[genes_in_modules_bool[,x],])))
names(genes_in_modules) = names(mod_sizes)
capture.output(genes_in_modules, file = "../results/dge/modules/genes_in_modules")
save(genes_in_modules, file="../results/dge/genes_in_modules.RData")
for(x in names(mod_sizes)){
plot_z_score_heatmap(z_scores,
genes_in_modules[[x]],
col_order$msa,
annot_col$msa,
paste("Genes in ", x, sep=""),
col_order$msa)
}
plot_z_score_heatmap_with_modules(z_scores,
rownames(z_scores),
col_order$msa,
annot_col$msa,
genes_in_modules[c('ME1', 'ME2', 'ME4', 'ME7', 'ME8')],
"Genes in selected modules")
pdf('../results/dge/modules/z_score_selected_modules_msa.pdf')
plot_z_score_heatmap_with_modules(z_scores,
rownames(z_scores),
col_order$msa,
annot_col$msa,
genes_in_modules[c('ME1', 'ME2', 'ME4', 'ME7', 'ME8')],
"Genes in selected modules")
dev.off()
plot_z_score_heatmap_with_modules(z_scores,
rownames(z_scores),
col_order$msa,
annot_col$msa,
genes_in_modules,
"All genes in modules")
pdf('../results/dge/modules/z_score_modules_msa.pdf')
plot_z_score_heatmap_with_modules(z_scores,
rownames(z_scores),
col_order$msa,
annot_col$msa,
genes_in_modules,
"All genes in modules")
dev.off()
for(x in names(mod_sizes)){
plot_z_score_heatmap(z_scores,
genes_in_modules[[x]],
col_order$sma,
annot_col$sma,
paste("Genes in ", x, sep=""),
col_order$sma)
pdf(paste("../results/dge/modules/module_heatmap", x, ".pdf", sep=""))
plot_z_score_heatmap(z_scores,
genes_in_modules[[x]],
col_order$sma,
annot_col$sma,
paste("Genes in ", x, sep=""),
col_order$sma)
dev.off()
}
for(x in names(mod_sizes)){
plot_z_score_heatmap(z_scores,
genes_in_modules[[x]],
col_order$ams,
annot_col$ams,
paste("Genes in ", x, sep=""),
col_order$ams)
}
The mean of the Z-score over the samples in the group is plot for each gene
plot_z_score_sinaplot = function(g, trait, z_scores, module_nb, genes_in_modules, pdf = FALSE){
g_trait = trait[[g]]
for(x in names(mod_sizes)){
genes_in_mod = genes_in_modules[[x]]
z_scores_vec = c()
groups = c()
for(y in colnames(g_trait)[-1]){
samples = g_trait %>%
filter((!!as.name(y)) == 1) %>%
pull(sample)
mean_z_scores = apply(z_scores[genes_in_mod, samples], 1, mean)
z_scores_vec = c(z_scores_vec, mean_z_scores)
groups = c(groups, rep(y, length(mean_z_scores)))
}
sinaplot(z_scores_vec,
groups,
col=pal2[x],
pch = 20,
bty = "n",
main=paste("Z-scores for the ", mod_sizes[x], " genes in ", x, sep=""),
las=2,
ylab = "Mean Z-scores of the genes over samples",
cex.axis=0.75)
if(pdf){
pdf(paste("../results/dge/modules/module_sinaplot_", x, ".pdf", sep=""))
sinaplot(z_scores_vec,
groups,
col=pal2[x],
pch = 20,
bty = "n",
main = paste("Z-scores for genes in ", x, sep=""),
las=2,
ylab = "Mean Z-scores of the genes over samples",
cex.axis=0.75)
dev.off()
}
}
}
plot_z_score_sinaplot('Microbiota / Age', trait, z_scores, module_nb, genes_in_modules)
plot_z_score_sinaplot('Microbiota / Age / Sex', trait, z_scores, module_nb, genes_in_modules)
plot_z_score_sinaplot('Age / Microbiota / Sex', trait, z_scores, module_nb, genes_in_modules, pdf = TRUE)
x = names(mod_sizes)[1]
pwf = suppressMessages(nullp(genes_in_modules_bool[, x], 'mm10', 'geneSymbol', plot.fit=F))
for(x in names(mod_sizes)){
print(x)
pwf = suppressMessages(nullp(genes_in_modules_bool[, x], 'mm10', 'geneSymbol', plot.fit=F))
GO_wall = goseq(pwf, 'mm10', 'geneSymbol')
adj_GO_wall = GO_wall[p.adjust(GO_wall[,"over_represented_pvalue"], method="BH")<.05,]
print(head(adj_GO_wall[adj_GO_wall$ontology == "BP",]))
write.table(adj_GO_wall, paste("../results/dge/modules/", x, "_GO", sep=""), sep = "\t", quote = FALSE, row.names = FALSE)
}
citation("WGCNA")
citation("sinaplot")
toBibtex(citation("sinaplot"))
citation("goseq")
citation("org.Mm.eg.db")